import pandas as pd
linkData="C:\\Users\\LENOVO\\Desktop\\Semana 10\\covid_contagiados.csv"
covid = pd.read_csv(linkData)
covid.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4585360 entries, 0 to 4585359 Data columns (total 10 columns): # Column Dtype --- ------ ----- 0 FECHA_CORTE int64 1 DEPARTAMENTO object 2 PROVINCIA object 3 DISTRITO object 4 METODODX object 5 EDAD float64 6 SEXO object 7 FECHA_RESULTADO float64 8 UBIGEO float64 9 id_persona float64 dtypes: float64(4), int64(1), object(5) memory usage: 349.8+ MB
covid.head()
| FECHA_CORTE | DEPARTAMENTO | PROVINCIA | DISTRITO | METODODX | EDAD | SEXO | FECHA_RESULTADO | UBIGEO | id_persona | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20241203 | TUMBES | TUMBES | TUMBES | AG | 46.0 | FEMENINO | 20221207.0 | 240101.0 | 203499.0 |
| 1 | 20241203 | LIMA | LIMA | JESUS MARIA | AG | 69.0 | FEMENINO | 20230822.0 | 150113.0 | 221397.0 |
| 2 | 20241203 | SAN MARTIN | MOYOBAMBA | MOYOBAMBA | AG | 55.0 | FEMENINO | 20240108.0 | 220101.0 | 295651.0 |
| 3 | 20241203 | AREQUIPA | CAYLLOMA | COPORAQUE | AG | 50.0 | MASCULINO | 20230824.0 | 40506.0 | 851625.0 |
| 4 | 20241203 | LIMA | LIMA | JESUS MARIA | AG | 58.0 | MASCULINO | 20221217.0 | 150113.0 | 287786.0 |
covid['FECHA_RESULTADO'] = covid['FECHA_RESULTADO'].astype(str).str[:-4]
# Mostrar las primeras filas para verificar el cambio
print(covid[['FECHA_RESULTADO']].head())
FECHA_RESULTADO 0 202212 1 202308 2 202401 3 202308 4 202212
# years in data
covid.FECHA_RESULTADO.value_counts()
FECHA_RESULTADO
202201 1010098
202207 286577
202008 243265
202104 242400
202103 235429
202202 216536
202102 195843
202212 184993
202208 184170
202009 157189
202105 148467
202007 144012
202101 133867
202005 131513
202006 121560
202211 113985
202106 95254
202010 81303
202112 60846
202107 59960
202206 53342
202011 48244
202012 47717
202004 46500
202111 41644
202108 35360
202109 30043
202110 28762
202209 26748
202312 26289
202401 24721
202203 24168
202301 17829
202205 12933
202204 10337
202304 9121
202210 8126
202303 7589
202305 6163
202311 5569
202306 5403
202302 4213
202307 3214
202309 3147
202310 2829
202402 2274
2023
202308 1995
202003 1317
189912 394
202403 79
Name: count, dtype: int64
Eliminamos datos NAs de "FECHA_RESULTADO":
covid['FECHA_RESULTADO'] = pd.to_numeric(covid['FECHA_RESULTADO'], errors='coerce')
covid = covid.dropna(subset=['FECHA_RESULTADO'])
Recodificamos la variable a numérica para poder filtrarla:
covid['FECHA_RESULTADO'] = covid['FECHA_RESULTADO'].astype(int)
Leemos el subset:
covid = covid[(covid.FECHA_RESULTADO >= 202000) &
(covid.FECHA_RESULTADO <= 202199) &
(covid.PROVINCIA != "EN INVESTIGACIÓN")]
covid.FECHA_RESULTADO.value_counts()
FECHA_RESULTADO 202008 230258 202104 229819 202103 221141 202102 183990 202009 147985 202105 141260 202007 135799 202005 129951 202101 126279 202006 116148 202106 90488 202010 76135 202112 59152 202107 57014 202004 45911 202011 45491 202012 44705 202111 39128 202108 34134 202109 28235 202110 26903 202003 1302 Name: count, dtype: int64
Recategorizamos
labels = ["Niños", "Adolescentes", "Jovenes_Adultos", "Adultos_Edad_Media", "Adultos_Mayores"]
covid['CATEGORIA_EDAD'] = pd.cut(covid['EDAD'], bins=[0, 9, 19, 39, 59, float('inf')], labels=labels)
Tenemos la distribución de contagiados por edad:
covid.CATEGORIA_EDAD.value_counts()
CATEGORIA_EDAD Jovenes_Adultos 907276 Adultos_Edad_Media 747203 Adultos_Mayores 376492 Adolescentes 128428 Niños 43591 Name: count, dtype: int64
Vemos las posibilidades:
#Por si queremos ver el fenómeno quitando a los Niños
covid_contagio=covid[covid.CATEGORIA_EDAD!='Niños']
covid_contagio.head()
| FECHA_CORTE | DEPARTAMENTO | PROVINCIA | DISTRITO | METODODX | EDAD | SEXO | FECHA_RESULTADO | UBIGEO | id_persona | CATEGORIA_EDAD | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 20241203 | TUMBES | ZARUMILLA | AGUAS VERDES | AG | 52.0 | MASCULINO | 202012 | 240302.0 | 233066.0 | Adultos_Edad_Media |
| 10 | 20241203 | TUMBES | ZARUMILLA | ZARUMILLA | AG | 42.0 | FEMENINO | 202012 | 240301.0 | 307679.0 | Adultos_Edad_Media |
| 27 | 20241203 | TACNA | TACNA | ALTO DE LA ALIANZA | AG | 58.0 | FEMENINO | 202101 | 230102.0 | 256563.0 | Adultos_Edad_Media |
| 31 | 20241203 | SAN MARTIN | SAN MARTIN | TARAPOTO | AG | 48.0 | MASCULINO | 202012 | 220901.0 | 301845.0 | Adultos_Edad_Media |
| 38 | 20241203 | TACNA | TACNA | TACNA | AG | 45.0 | FEMENINO | 202101 | 230101.0 | 348484.0 | Adultos_Edad_Media |
Reshaping to Long¶
CONTAGIADOS¶
Personas contagiadas por provincia, por departamento, por año y mes:
indexList=['FECHA_RESULTADO','DEPARTAMENTO','PROVINCIA', 'CATEGORIA_EDAD']
aggregator={'CATEGORIA_EDAD':[len]}
covid_provYear=covid_contagio.groupby(indexList,observed=True).agg(aggregator)
covid_provYear
| CATEGORIA_EDAD | ||||
|---|---|---|---|---|
| len | ||||
| FECHA_RESULTADO | DEPARTAMENTO | PROVINCIA | CATEGORIA_EDAD | |
| 202003 | ANCASH | HUARAZ | Adolescentes | 1 |
| SANTA | Jovenes_Adultos | 2 | ||
| Adultos_Edad_Media | 9 | |||
| Adultos_Mayores | 1 | |||
| AREQUIPA | AREQUIPA | Adolescentes | 2 | |
| ... | ... | ... | ... | ... |
| 202112 | UCAYALI | CORONEL PORTILLO | Adultos_Mayores | 25 |
| PADRE ABAD | Adolescentes | 13 | ||
| Jovenes_Adultos | 35 | |||
| Adultos_Edad_Media | 17 | |||
| Adultos_Mayores | 5 |
14952 rows × 1 columns
Mandamos los counts a las "wide columns":
covidDraft=covid_provYear.unstack(3).fillna(0) #leftmost index in rows
covidDraft
| CATEGORIA_EDAD | ||||||
|---|---|---|---|---|---|---|
| len | ||||||
| CATEGORIA_EDAD | Adolescentes | Jovenes_Adultos | Adultos_Edad_Media | Adultos_Mayores | ||
| FECHA_RESULTADO | DEPARTAMENTO | PROVINCIA | ||||
| 202003 | ANCASH | HUARAZ | 1.0 | 0.0 | 0.0 | 0.0 |
| SANTA | 0.0 | 2.0 | 9.0 | 1.0 | ||
| AREQUIPA | AREQUIPA | 2.0 | 8.0 | 8.0 | 11.0 | |
| CARAVELI | 0.0 | 1.0 | 0.0 | 0.0 | ||
| AYACUCHO | HUAMANGA | 0.0 | 0.0 | 1.0 | 0.0 | |
| ... | ... | ... | ... | ... | ... | ... |
| 202112 | TUMBES | CONTRALMIRANTE VILLAR | 1.0 | 14.0 | 7.0 | 5.0 |
| TUMBES | 11.0 | 77.0 | 75.0 | 33.0 | ||
| ZARUMILLA | 6.0 | 24.0 | 23.0 | 12.0 | ||
| UCAYALI | CORONEL PORTILLO | 1.0 | 30.0 | 16.0 | 25.0 | |
| PADRE ABAD | 13.0 | 35.0 | 17.0 | 5.0 | ||
4061 rows × 4 columns
Cuota informática de covid, Adultos Mayores:
covidDraft['Adultos_Mayores_pct'] = covidDraft.iloc[:, 1] / (covidDraft.iloc[:, 0] + covidDraft.iloc[:, 1] + covidDraft.iloc[:, 2] + covidDraft.iloc[:, 3])
covid_provYear_Adultos_Mayores_w=covidDraft['Adultos_Mayores_pct'].unstack('FECHA_RESULTADO').fillna(0)
covid_provYear_Adultos_Mayores_w
| FECHA_RESULTADO | 202003 | 202004 | 202005 | 202006 | 202007 | 202008 | 202009 | 202010 | 202011 | 202012 | ... | 202103 | 202104 | 202105 | 202106 | 202107 | 202108 | 202109 | 202110 | 202111 | 202112 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| DEPARTAMENTO | PROVINCIA | |||||||||||||||||||||
| AMAZONAS | BAGUA | 0.00 | 0.250000 | 0.559524 | 0.355491 | 0.396988 | 0.364357 | 0.338021 | 0.369072 | 0.407407 | 0.425926 | ... | 0.480349 | 0.386986 | 0.382625 | 0.401180 | 0.381818 | 0.373626 | 0.443182 | 0.522613 | 0.453287 | 0.400631 |
| BONGARA | 0.00 | 0.333333 | 0.555556 | 0.238095 | 0.425926 | 0.349398 | 0.388889 | 0.275000 | 0.272727 | 0.590909 | ... | 0.436242 | 0.423237 | 0.442424 | 0.389610 | 0.548387 | 0.366667 | 0.533333 | 0.320000 | 0.187500 | 0.341463 | |
| CHACHAPOYAS | 0.00 | 0.482759 | 0.477528 | 0.557377 | 0.508197 | 0.476596 | 0.403448 | 0.547988 | 0.508772 | 0.553846 | ... | 0.449951 | 0.474308 | 0.468127 | 0.448387 | 0.463415 | 0.566667 | 0.592593 | 0.615385 | 0.555556 | 0.397351 | |
| CONDORCANQUI | 0.00 | 0.000000 | 0.333333 | 0.524064 | 0.470363 | 0.531183 | 0.520833 | 0.485577 | 0.504673 | 0.494505 | ... | 0.430769 | 0.472603 | 0.513761 | 0.488095 | 0.714286 | 0.500000 | 0.333333 | 0.368421 | 0.333333 | 0.454545 | |
| LUYA | 0.00 | 0.000000 | 0.700000 | 0.569892 | 0.565217 | 0.489051 | 0.461538 | 0.448276 | 0.562500 | 0.307692 | ... | 0.552381 | 0.435484 | 0.411168 | 0.393162 | 0.431373 | 0.422222 | 0.500000 | 0.235294 | 0.210526 | 0.433333 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TUMBES | ZARUMILLA | 0.25 | 0.368421 | 0.295652 | 0.345382 | 0.494565 | 0.381974 | 0.366762 | 0.362500 | 0.422680 | 0.525000 | ... | 0.413793 | 0.394813 | 0.389610 | 0.336735 | 0.442623 | 0.381818 | 0.440476 | 0.328358 | 0.255319 | 0.369231 |
| UCAYALI | ATALAYA | 0.00 | 0.000000 | 0.000000 | 0.571429 | 0.397959 | 0.436090 | 0.465753 | 0.482456 | 0.636364 | 0.333333 | ... | 0.372549 | 0.370370 | 0.405405 | 0.375000 | 0.166667 | 0.500000 | 0.333333 | 0.000000 | 0.333333 | 0.000000 |
| CORONEL PORTILLO | 0.00 | 0.438525 | 0.412000 | 0.406504 | 0.354535 | 0.341037 | 0.360943 | 0.327811 | 0.268139 | 0.296667 | ... | 0.407807 | 0.391324 | 0.371469 | 0.408946 | 0.413223 | 0.457627 | 0.488372 | 0.472222 | 0.346939 | 0.416667 | |
| PADRE ABAD | 0.00 | 0.250000 | 0.356589 | 0.410256 | 0.301435 | 0.358804 | 0.302632 | 0.430464 | 0.333333 | 0.432432 | ... | 0.432927 | 0.487500 | 0.382609 | 0.565217 | 0.476190 | 0.230769 | 0.250000 | 0.545455 | 0.533333 | 0.500000 | |
| PURUS | 0.00 | 0.000000 | 0.000000 | 0.000000 | 0.210526 | 0.315789 | 0.360656 | 0.367742 | 0.296296 | 0.000000 | ... | 0.348837 | 0.421053 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
196 rows × 22 columns
Notar el tipo de data
covid_provYear_Adultos_Mayores_w.columns
Index([202003, 202004, 202005, 202006, 202007, 202008, 202009, 202010, 202011,
202012, 202101, 202102, 202103, 202104, 202105, 202106, 202107, 202108,
202109, 202110, 202111, 202112],
dtype='int32', name='FECHA_RESULTADO')
DeberÃamos tener texto y no números:
covid_provYear_Adultos_Mayores_w.columns=['date'+str(x) for x in covid_provYear_Adultos_Mayores_w.columns]
# then
covid_provYear_Adultos_Mayores_w
| date202003 | date202004 | date202005 | date202006 | date202007 | date202008 | date202009 | date202010 | date202011 | date202012 | ... | date202103 | date202104 | date202105 | date202106 | date202107 | date202108 | date202109 | date202110 | date202111 | date202112 | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| DEPARTAMENTO | PROVINCIA | |||||||||||||||||||||
| AMAZONAS | BAGUA | 0.00 | 0.250000 | 0.559524 | 0.355491 | 0.396988 | 0.364357 | 0.338021 | 0.369072 | 0.407407 | 0.425926 | ... | 0.480349 | 0.386986 | 0.382625 | 0.401180 | 0.381818 | 0.373626 | 0.443182 | 0.522613 | 0.453287 | 0.400631 |
| BONGARA | 0.00 | 0.333333 | 0.555556 | 0.238095 | 0.425926 | 0.349398 | 0.388889 | 0.275000 | 0.272727 | 0.590909 | ... | 0.436242 | 0.423237 | 0.442424 | 0.389610 | 0.548387 | 0.366667 | 0.533333 | 0.320000 | 0.187500 | 0.341463 | |
| CHACHAPOYAS | 0.00 | 0.482759 | 0.477528 | 0.557377 | 0.508197 | 0.476596 | 0.403448 | 0.547988 | 0.508772 | 0.553846 | ... | 0.449951 | 0.474308 | 0.468127 | 0.448387 | 0.463415 | 0.566667 | 0.592593 | 0.615385 | 0.555556 | 0.397351 | |
| CONDORCANQUI | 0.00 | 0.000000 | 0.333333 | 0.524064 | 0.470363 | 0.531183 | 0.520833 | 0.485577 | 0.504673 | 0.494505 | ... | 0.430769 | 0.472603 | 0.513761 | 0.488095 | 0.714286 | 0.500000 | 0.333333 | 0.368421 | 0.333333 | 0.454545 | |
| LUYA | 0.00 | 0.000000 | 0.700000 | 0.569892 | 0.565217 | 0.489051 | 0.461538 | 0.448276 | 0.562500 | 0.307692 | ... | 0.552381 | 0.435484 | 0.411168 | 0.393162 | 0.431373 | 0.422222 | 0.500000 | 0.235294 | 0.210526 | 0.433333 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TUMBES | ZARUMILLA | 0.25 | 0.368421 | 0.295652 | 0.345382 | 0.494565 | 0.381974 | 0.366762 | 0.362500 | 0.422680 | 0.525000 | ... | 0.413793 | 0.394813 | 0.389610 | 0.336735 | 0.442623 | 0.381818 | 0.440476 | 0.328358 | 0.255319 | 0.369231 |
| UCAYALI | ATALAYA | 0.00 | 0.000000 | 0.000000 | 0.571429 | 0.397959 | 0.436090 | 0.465753 | 0.482456 | 0.636364 | 0.333333 | ... | 0.372549 | 0.370370 | 0.405405 | 0.375000 | 0.166667 | 0.500000 | 0.333333 | 0.000000 | 0.333333 | 0.000000 |
| CORONEL PORTILLO | 0.00 | 0.438525 | 0.412000 | 0.406504 | 0.354535 | 0.341037 | 0.360943 | 0.327811 | 0.268139 | 0.296667 | ... | 0.407807 | 0.391324 | 0.371469 | 0.408946 | 0.413223 | 0.457627 | 0.488372 | 0.472222 | 0.346939 | 0.416667 | |
| PADRE ABAD | 0.00 | 0.250000 | 0.356589 | 0.410256 | 0.301435 | 0.358804 | 0.302632 | 0.430464 | 0.333333 | 0.432432 | ... | 0.432927 | 0.487500 | 0.382609 | 0.565217 | 0.476190 | 0.230769 | 0.250000 | 0.545455 | 0.533333 | 0.500000 | |
| PURUS | 0.00 | 0.000000 | 0.000000 | 0.000000 | 0.210526 | 0.315789 | 0.360656 | 0.367742 | 0.296296 | 0.000000 | ... | 0.348837 | 0.421053 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
196 rows × 22 columns
# as usual
covid_provYear_Adultos_Mayores_w.reset_index(inplace=True)
covid_provYear_Adultos_Mayores_w
| DEPARTAMENTO | PROVINCIA | date202003 | date202004 | date202005 | date202006 | date202007 | date202008 | date202009 | date202010 | ... | date202103 | date202104 | date202105 | date202106 | date202107 | date202108 | date202109 | date202110 | date202111 | date202112 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AMAZONAS | BAGUA | 0.00 | 0.250000 | 0.559524 | 0.355491 | 0.396988 | 0.364357 | 0.338021 | 0.369072 | ... | 0.480349 | 0.386986 | 0.382625 | 0.401180 | 0.381818 | 0.373626 | 0.443182 | 0.522613 | 0.453287 | 0.400631 |
| 1 | AMAZONAS | BONGARA | 0.00 | 0.333333 | 0.555556 | 0.238095 | 0.425926 | 0.349398 | 0.388889 | 0.275000 | ... | 0.436242 | 0.423237 | 0.442424 | 0.389610 | 0.548387 | 0.366667 | 0.533333 | 0.320000 | 0.187500 | 0.341463 |
| 2 | AMAZONAS | CHACHAPOYAS | 0.00 | 0.482759 | 0.477528 | 0.557377 | 0.508197 | 0.476596 | 0.403448 | 0.547988 | ... | 0.449951 | 0.474308 | 0.468127 | 0.448387 | 0.463415 | 0.566667 | 0.592593 | 0.615385 | 0.555556 | 0.397351 |
| 3 | AMAZONAS | CONDORCANQUI | 0.00 | 0.000000 | 0.333333 | 0.524064 | 0.470363 | 0.531183 | 0.520833 | 0.485577 | ... | 0.430769 | 0.472603 | 0.513761 | 0.488095 | 0.714286 | 0.500000 | 0.333333 | 0.368421 | 0.333333 | 0.454545 |
| 4 | AMAZONAS | LUYA | 0.00 | 0.000000 | 0.700000 | 0.569892 | 0.565217 | 0.489051 | 0.461538 | 0.448276 | ... | 0.552381 | 0.435484 | 0.411168 | 0.393162 | 0.431373 | 0.422222 | 0.500000 | 0.235294 | 0.210526 | 0.433333 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 191 | TUMBES | ZARUMILLA | 0.25 | 0.368421 | 0.295652 | 0.345382 | 0.494565 | 0.381974 | 0.366762 | 0.362500 | ... | 0.413793 | 0.394813 | 0.389610 | 0.336735 | 0.442623 | 0.381818 | 0.440476 | 0.328358 | 0.255319 | 0.369231 |
| 192 | UCAYALI | ATALAYA | 0.00 | 0.000000 | 0.000000 | 0.571429 | 0.397959 | 0.436090 | 0.465753 | 0.482456 | ... | 0.372549 | 0.370370 | 0.405405 | 0.375000 | 0.166667 | 0.500000 | 0.333333 | 0.000000 | 0.333333 | 0.000000 |
| 193 | UCAYALI | CORONEL PORTILLO | 0.00 | 0.438525 | 0.412000 | 0.406504 | 0.354535 | 0.341037 | 0.360943 | 0.327811 | ... | 0.407807 | 0.391324 | 0.371469 | 0.408946 | 0.413223 | 0.457627 | 0.488372 | 0.472222 | 0.346939 | 0.416667 |
| 194 | UCAYALI | PADRE ABAD | 0.00 | 0.250000 | 0.356589 | 0.410256 | 0.301435 | 0.358804 | 0.302632 | 0.430464 | ... | 0.432927 | 0.487500 | 0.382609 | 0.565217 | 0.476190 | 0.230769 | 0.250000 | 0.545455 | 0.533333 | 0.500000 |
| 195 | UCAYALI | PURUS | 0.00 | 0.000000 | 0.000000 | 0.000000 | 0.210526 | 0.315789 | 0.360656 | 0.367742 | ... | 0.348837 | 0.421053 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
196 rows × 24 columns
Llamamos al mapa:
mapLink='https://github.com/SocialAnalytics-StrategicIntelligence/GeoDF_Analytics/raw/main/maps/ProvsINEI2023.zip'
import geopandas as gpd
provmap=gpd.read_file(mapLink)
provmap.info()
<class 'geopandas.geodataframe.GeoDataFrame'> RangeIndex: 196 entries, 0 to 195 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OBJECTID 196 non-null float64 1 CCDD 196 non-null object 2 CCPP 196 non-null object 3 DEPARTAMEN 196 non-null object 4 PROVINCIA 196 non-null object 5 geometry 196 non-null geometry dtypes: float64(1), geometry(1), object(4) memory usage: 9.3+ KB
Creamos una columna para concatenar dos:
provmap['location']=['+'.join(x[0]) for x in zip(provmap.iloc[:,3:5].values)]
provmap.head(10)
| OBJECTID | CCDD | CCPP | DEPARTAMEN | PROVINCIA | geometry | location | |
|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 01 | 01 | AMAZONAS | CHACHAPOYAS | POLYGON ((-77.72614 -5.94354, -77.72486 -5.943... | AMAZONAS+CHACHAPOYAS |
| 1 | 2.0 | 01 | 02 | AMAZONAS | BAGUA | POLYGON ((-78.61909 -4.51001, -78.61802 -4.510... | AMAZONAS+BAGUA |
| 2 | 3.0 | 01 | 03 | AMAZONAS | BONGARA | POLYGON ((-77.72759 -5.1403, -77.72361 -5.1406... | AMAZONAS+BONGARA |
| 3 | 4.0 | 01 | 04 | AMAZONAS | CONDORCANQUI | POLYGON ((-77.81399 -2.99278, -77.81483 -2.995... | AMAZONAS+CONDORCANQUI |
| 4 | 5.0 | 01 | 05 | AMAZONAS | LUYA | POLYGON ((-78.13023 -5.9037, -78.13011 -5.9041... | AMAZONAS+LUYA |
| 5 | 6.0 | 01 | 06 | AMAZONAS | RODRIGUEZ DE MENDOZA | POLYGON ((-77.44452 -6.05002, -77.44387 -6.050... | AMAZONAS+RODRIGUEZ DE MENDOZA |
| 6 | 7.0 | 01 | 07 | AMAZONAS | UTCUBAMBA | POLYGON ((-78.09288 -5.36258, -78.09288 -5.364... | AMAZONAS+UTCUBAMBA |
| 7 | 8.0 | 02 | 01 | ANCASH | HUARAZ | POLYGON ((-77.3987 -9.35563, -77.39852 -9.3560... | ANCASH+HUARAZ |
| 8 | 9.0 | 02 | 02 | ANCASH | AIJA | POLYGON ((-77.61368 -9.649, -77.61241 -9.64975... | ANCASH+AIJA |
| 9 | 10.0 | 02 | 03 | ANCASH | ANTONIO RAYMONDI | POLYGON ((-77.08856 -8.97496, -77.08804 -8.975... | ANCASH+ANTONIO RAYMONDI |
Hacemos lo mismo con el data frame:
covid_provYear_Adultos_Mayores_w['location']=['+'.join(x[0]) for x in zip(covid_provYear_Adultos_Mayores_w.iloc[:,:2].values)]
covid_provYear_Adultos_Mayores_w.head()
| DEPARTAMENTO | PROVINCIA | date202003 | date202004 | date202005 | date202006 | date202007 | date202008 | date202009 | date202010 | ... | date202104 | date202105 | date202106 | date202107 | date202108 | date202109 | date202110 | date202111 | date202112 | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AMAZONAS | BAGUA | 0.0 | 0.250000 | 0.559524 | 0.355491 | 0.396988 | 0.364357 | 0.338021 | 0.369072 | ... | 0.386986 | 0.382625 | 0.401180 | 0.381818 | 0.373626 | 0.443182 | 0.522613 | 0.453287 | 0.400631 | AMAZONAS+BAGUA |
| 1 | AMAZONAS | BONGARA | 0.0 | 0.333333 | 0.555556 | 0.238095 | 0.425926 | 0.349398 | 0.388889 | 0.275000 | ... | 0.423237 | 0.442424 | 0.389610 | 0.548387 | 0.366667 | 0.533333 | 0.320000 | 0.187500 | 0.341463 | AMAZONAS+BONGARA |
| 2 | AMAZONAS | CHACHAPOYAS | 0.0 | 0.482759 | 0.477528 | 0.557377 | 0.508197 | 0.476596 | 0.403448 | 0.547988 | ... | 0.474308 | 0.468127 | 0.448387 | 0.463415 | 0.566667 | 0.592593 | 0.615385 | 0.555556 | 0.397351 | AMAZONAS+CHACHAPOYAS |
| 3 | AMAZONAS | CONDORCANQUI | 0.0 | 0.000000 | 0.333333 | 0.524064 | 0.470363 | 0.531183 | 0.520833 | 0.485577 | ... | 0.472603 | 0.513761 | 0.488095 | 0.714286 | 0.500000 | 0.333333 | 0.368421 | 0.333333 | 0.454545 | AMAZONAS+CONDORCANQUI |
| 4 | AMAZONAS | LUYA | 0.0 | 0.000000 | 0.700000 | 0.569892 | 0.565217 | 0.489051 | 0.461538 | 0.448276 | ... | 0.435484 | 0.411168 | 0.393162 | 0.431373 | 0.422222 | 0.500000 | 0.235294 | 0.210526 | 0.433333 | AMAZONAS+LUYA |
5 rows × 25 columns
Preprocessing¶
Los nombres de paÃses que no están escritos en inglés pueden venir con algunos sÃmbolos que pueden causar problemas (', ~). Deshagámonos de ellos:
import unidecode
byePunctuation=lambda x: unidecode.unidecode(x)
covid_provYear_Adultos_Mayores_w['location']=covid_provYear_Adultos_Mayores_w['location'].apply(byePunctuation)
provmap['location']=provmap['location'].apply(byePunctuation)
EstarÃa bien asegurarse de que no aparezca ningún "ghost" entre las palabras:
# replacing dashes and multiple spaces by a simple space
covid_provYear_Adultos_Mayores_w['location'] = covid_provYear_Adultos_Mayores_w.location.str.replace(r"\-|\_|\s+", "", regex=True)
provmap['location'] = provmap.location.str.replace(r"\-|\_|\s+", "", regex=True)
Merging¶
Necesitamos fusionar ambas tablas ahora. Eso puede suceder efectivamente si ambas tablas tienen una columna clave: una columna (o colección de ellas) cuyos valores en una tabla son los mismos en la otra.
No es necesario que la coincidencia sea exacta, pero solo se fusionan los valores comunes en la clave/llave.
Averigüemos qué NO coincide en cada tabla:
nomatch_df=set(covid_provYear_Adultos_Mayores_w.location)- set(provmap.location)
nomatch_gdf=set(provmap.location)-set(covid_provYear_Adultos_Mayores_w.location)
Esto es lo que no coincide:
len(nomatch_df), len(nomatch_gdf)
(2, 2)
La forma correcta de hacerlo es utilizar la fuzzy merging (recuerde que necesitamos the fuzz):
# pick the closest match from nomatch_gdf for a value in nomatch_df
!pip install thefuzz
from thefuzz import process
[(dis,process.extractOne(dis,nomatch_gdf)) for dis in sorted(nomatch_df)]
Requirement already satisfied: thefuzz in c:\users\lenovo\anaconda3\lib\site-packages (0.22.1) Requirement already satisfied: rapidfuzz<4.0.0,>=3.0.0 in c:\users\lenovo\anaconda3\lib\site-packages (from thefuzz) (3.10.1)
[('ANCASH+ANTONIORAIMONDI', ('ANCASH+ANTONIORAYMONDI', 95)),
('ICA+NAZCA', ('ICA+NASCA', 89))]
Si te sientes cómodo, prepara un diccionario de cambios:
# is this OK?
{dis:process.extractOne(dis,nomatch_gdf)[0] for dis in sorted(nomatch_df)}
{'ANCASH+ANTONIORAIMONDI': 'ANCASH+ANTONIORAYMONDI', 'ICA+NAZCA': 'ICA+NASCA'}
# then:
changesinDF={dis:process.extractOne(dis,nomatch_gdf)[0] for dis in sorted(nomatch_df)}
Ahora, hacemos reemplazos:
covid_provYear_Adultos_Mayores_w.replace({'location': changesinDF}, inplace=True)
¿Se acabó?
nomatch_df=set(covid_provYear_Adultos_Mayores_w.location)- set(provmap.location)
nomatch_gdf=set(provmap.location)-set(covid_provYear_Adultos_Mayores_w.location)
[(dis,process.extractOne(dis,nomatch_gdf)) for dis in sorted(nomatch_df)]
[]
Ahora el merge puede hacerse:
covid_provYear_Adultos_Mayores_map=provmap.merge(covid_provYear_Adultos_Mayores_w, on='location',how='left',indicator='flag')
# check
covid_provYear_Adultos_Mayores_map.info()
<class 'geopandas.geodataframe.GeoDataFrame'> RangeIndex: 196 entries, 0 to 195 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OBJECTID 196 non-null float64 1 CCDD 196 non-null object 2 CCPP 196 non-null object 3 DEPARTAMEN 196 non-null object 4 PROVINCIA_x 196 non-null object 5 geometry 196 non-null geometry 6 location 196 non-null object 7 DEPARTAMENTO 196 non-null object 8 PROVINCIA_y 196 non-null object 9 date202003 196 non-null float64 10 date202004 196 non-null float64 11 date202005 196 non-null float64 12 date202006 196 non-null float64 13 date202007 196 non-null float64 14 date202008 196 non-null float64 15 date202009 196 non-null float64 16 date202010 196 non-null float64 17 date202011 196 non-null float64 18 date202012 196 non-null float64 19 date202101 196 non-null float64 20 date202102 196 non-null float64 21 date202103 196 non-null float64 22 date202104 196 non-null float64 23 date202105 196 non-null float64 24 date202106 196 non-null float64 25 date202107 196 non-null float64 26 date202108 196 non-null float64 27 date202109 196 non-null float64 28 date202110 196 non-null float64 29 date202111 196 non-null float64 30 date202112 196 non-null float64 31 flag 196 non-null category dtypes: category(1), float64(23), geometry(1), object(7) memory usage: 47.9+ KB
# avoid poblems with fillna()
covid_provYear_Adultos_Mayores_map['flag']=covid_provYear_Adultos_Mayores_map.flag.astype(str)
Podemos deshacernos de algunas columnas:
bye=['DEPARTAMENTO', 'PROVINCIA_y', 'CCPP','CCDD']
covid_provYear_Adultos_Mayores_map.drop(columns=bye,inplace=True)
# keeping
covid_provYear_Adultos_Mayores_map.head()
| OBJECTID | DEPARTAMEN | PROVINCIA_x | geometry | location | date202003 | date202004 | date202005 | date202006 | date202007 | ... | date202104 | date202105 | date202106 | date202107 | date202108 | date202109 | date202110 | date202111 | date202112 | flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | AMAZONAS | CHACHAPOYAS | POLYGON ((-77.72614 -5.94354, -77.72486 -5.943... | AMAZONAS+CHACHAPOYAS | 0.0 | 0.482759 | 0.477528 | 0.557377 | 0.508197 | ... | 0.474308 | 0.468127 | 0.448387 | 0.463415 | 0.566667 | 0.592593 | 0.615385 | 0.555556 | 0.397351 | both |
| 1 | 2.0 | AMAZONAS | BAGUA | POLYGON ((-78.61909 -4.51001, -78.61802 -4.510... | AMAZONAS+BAGUA | 0.0 | 0.250000 | 0.559524 | 0.355491 | 0.396988 | ... | 0.386986 | 0.382625 | 0.401180 | 0.381818 | 0.373626 | 0.443182 | 0.522613 | 0.453287 | 0.400631 | both |
| 2 | 3.0 | AMAZONAS | BONGARA | POLYGON ((-77.72759 -5.1403, -77.72361 -5.1406... | AMAZONAS+BONGARA | 0.0 | 0.333333 | 0.555556 | 0.238095 | 0.425926 | ... | 0.423237 | 0.442424 | 0.389610 | 0.548387 | 0.366667 | 0.533333 | 0.320000 | 0.187500 | 0.341463 | both |
| 3 | 4.0 | AMAZONAS | CONDORCANQUI | POLYGON ((-77.81399 -2.99278, -77.81483 -2.995... | AMAZONAS+CONDORCANQUI | 0.0 | 0.000000 | 0.333333 | 0.524064 | 0.470363 | ... | 0.472603 | 0.513761 | 0.488095 | 0.714286 | 0.500000 | 0.333333 | 0.368421 | 0.333333 | 0.454545 | both |
| 4 | 5.0 | AMAZONAS | LUYA | POLYGON ((-78.13023 -5.9037, -78.13011 -5.9041... | AMAZONAS+LUYA | 0.0 | 0.000000 | 0.700000 | 0.569892 | 0.565217 | ... | 0.435484 | 0.411168 | 0.393162 | 0.431373 | 0.422222 | 0.500000 | 0.235294 | 0.210526 | 0.433333 | both |
5 rows × 28 columns
# filling with zeroes
covid_provYear_Adultos_Mayores_map.fillna(0,inplace=True)
Podemos guardar este geoDF:
import os
covid_provYear_Adultos_Mayores_map.to_file(os.path.join('maps',"provinciasPeru.gpkg"), layer='provinciasDengue', driver="GPKG")
Exploramos una variable¶
Esta vez, exploramos estadÃsticamenete una variable en el mapa
# statistics
covid_provYear_Adultos_Mayores_map.date202101.describe()
count 196.000000 mean 0.403255 std 0.124009 min 0.000000 25% 0.357143 50% 0.418011 75% 0.460248 max 1.000000 Name: date202101, dtype: float64
Una mirada visual:
import seaborn as sea
sea.boxplot(covid_provYear_Adultos_Mayores_map.date202101, color='yellow',orient='h')
<Axes: xlabel='date202101'>
from sklearn.preprocessing import QuantileTransformer
qt = QuantileTransformer(n_quantiles=100, random_state=0,output_distribution='normal')
qt_result=qt.fit_transform(covid_provYear_Adultos_Mayores_map[['date202101']])
sea.boxplot(qt_result, color='yellow',orient='h')
<Axes: >
covid_provYear_Adultos_Mayores_map['date202101_qt']=qt_result
#!pip install libpysal
from libpysal.weights import Queen, Rook, KNN
# rook
w_rook = Rook.from_dataframe(covid_provYear_Adultos_Mayores_map,use_index=False)
# rook
w_queen = Queen.from_dataframe(covid_provYear_Adultos_Mayores_map,use_index=False)
# k nearest neighbors
w_knn = KNN.from_dataframe(covid_provYear_Adultos_Mayores_map, k=8)
Entendamos las diferencias:
# first one
covid_provYear_Adultos_Mayores_map.head(1)
| OBJECTID | DEPARTAMEN | PROVINCIA_x | geometry | location | date202003 | date202004 | date202005 | date202006 | date202007 | ... | date202105 | date202106 | date202107 | date202108 | date202109 | date202110 | date202111 | date202112 | flag | date202101_qt | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | AMAZONAS | CHACHAPOYAS | POLYGON ((-77.72614 -5.94354, -77.72486 -5.943... | AMAZONAS+CHACHAPOYAS | 0.0 | 0.482759 | 0.477528 | 0.557377 | 0.508197 | ... | 0.468127 | 0.448387 | 0.463415 | 0.566667 | 0.592593 | 0.615385 | 0.555556 | 0.397351 | both | 0.775034 |
1 rows × 29 columns
# amount neighbors of that district
w_rook.neighbors[0]
[2, 114, 4, 5, 180, 182, 55]
# see
base=covid_provYear_Adultos_Mayores_map[covid_provYear_Adultos_Mayores_map.PROVINCIA_x=="CHACHAPOYAS"].plot()
covid_provYear_Adultos_Mayores_map.iloc[w_rook.neighbors[0] ,].plot(ax=base,facecolor="yellow",edgecolor='k')
covid_provYear_Adultos_Mayores_map.head(1).plot(ax=base,facecolor="red")
<Axes: >
Hagamos lo mismo:
w_queen.neighbors[0]
[2, 114, 4, 5, 180, 182, 55]
base=covid_provYear_Adultos_Mayores_map[covid_provYear_Adultos_Mayores_map.PROVINCIA_x=="CHACHAPOYAS"].plot()
covid_provYear_Adultos_Mayores_map.iloc[w_queen.neighbors[0] ,].plot(ax=base,facecolor="yellow",edgecolor='k')
covid_provYear_Adultos_Mayores_map.head(1).plot(ax=base,facecolor="red")
<Axes: >
w_knn.neighbors[0]
[5, 4, 55, 182, 2, 59, 175, 114]
base=covid_provYear_Adultos_Mayores_map[covid_provYear_Adultos_Mayores_map.PROVINCIA_x=="CHACHAPOYAS"].plot()
covid_provYear_Adultos_Mayores_map.iloc[w_knn.neighbors[0] ,].plot(ax=base,facecolor="yellow",edgecolor='k')
covid_provYear_Adultos_Mayores_map.head(1).plot(ax=base,facecolor="red")
<Axes: >
Prestamos atención a los resultados de queen:
# all the neighbors by row
w_queen.neighbors
{0: [2, 114, 4, 5, 180, 182, 55],
1: [3, 60, 61, 6],
2: [0, 143, 3, 4, 6, 182, 175],
3: [1, 2, 6, 143],
4: [0, 2, 6, 55, 56],
5: [0, 178, 180, 182, 175],
6: [1, 2, 3, 4, 56, 58, 60],
7: [16, 17, 23, 8, 26, 12, 14],
8: [17, 7, 23],
9: [16, 90, 13],
10: [16, 26, 12, 13],
11: [96, 129, 128, 16, 17, 20, 23, 89, 91],
12: [16, 26, 10, 7],
13: [16, 19, 26, 90, 9, 10],
14: [24, 17, 26, 7],
15: [24, 25, 18, 21],
16: [7, 9, 10, 11, 12, 13, 23, 90, 91],
17: [128, 23, 7, 8, 11, 14],
18: [22, 24, 25, 26, 15],
19: [13, 26, 22, 90, 93],
20: [128, 129, 11, 134],
21: [119, 24, 121, 123, 25, 15],
22: [18, 19, 119, 25, 26, 93],
23: [16, 17, 7, 8, 11],
24: [18, 21, 121, 26, 123, 14, 15],
25: [18, 21, 22, 119, 15],
26: [7, 10, 12, 13, 14, 18, 19, 22, 24],
27: [33, 69, 75, 28, 29, 30, 31],
28: [32, 75, 46, 47, 48, 50, 52, 27, 30],
29: [48, 33, 73, 27, 31, 30, 41],
30: [48, 27, 28, 29],
31: [33, 69, 73, 27, 76, 29],
32: [42, 52, 28, 46],
33: [27, 29, 31],
34: [35, 149, 38, 40, 172],
35: [34, 36, 37, 38, 39, 40],
36: [48, 49, 35, 100, 39, 41, 47],
37: [35, 38, 39],
38: [34, 35, 37, 39, 168, 73, 74, 172],
39: [35, 36, 37, 38, 73, 74, 41],
40: [34, 35, 148, 149, 150],
41: [48, 49, 36, 39, 73, 29],
42: [32, 82, 52, 85, 43, 45, 46],
43: [42, 51, 52, 85],
44: [51, 85, 47],
45: [42, 75, 108, 46, 81, 82, 84, 86],
46: [32, 42, 75, 28, 45],
47: [36, 100, 101, 44, 48, 50, 51, 85, 28],
48: [49, 36, 41, 28, 29, 30, 47],
49: [48, 41, 36],
50: [51, 52, 28, 47],
51: [50, 52, 85, 43, 44, 47],
52: [32, 50, 51, 42, 43, 28],
53: [64, 54, 55, 57, 122, 59, 62],
54: [114, 53, 117, 120, 122, 62],
55: [0, 114, 4, 53, 56, 59, 62],
56: [65, 4, 6, 55, 58, 59, 124, 125],
57: [64, 113, 115, 53, 118, 122, 63],
58: [56, 60, 125, 6],
59: [64, 65, 53, 55, 56, 63],
60: [1, 125, 6, 58, 156, 61, 126],
61: [1, 156, 60],
62: [114, 53, 54, 55],
63: [64, 65, 115, 57, 59, 124],
64: [57, 59, 53, 63],
65: [56, 59, 124, 63],
66: [127],
67: [69, 70, 76, 78, 79],
68: [71, 72, 73, 76, 78],
69: [67, 75, 27, 76, 79, 31],
70: [146, 67, 75, 77, 78, 79],
71: [169, 68, 72, 73, 74],
72: [68, 164, 71, 169, 78],
73: [68, 38, 39, 71, 41, 74, 76, 29, 31],
74: [169, 38, 39, 168, 73, 71],
75: [193, 69, 70, 108, 45, 46, 79, 146, 27, 28],
76: [67, 68, 69, 73, 78, 31],
77: [78, 146, 70],
78: [67, 68, 164, 70, 72, 76, 77, 146],
79: [75, 67, 69, 70],
80: [103, 136, 81, 82, 83, 84, 85, 86],
81: [80, 82, 84, 45],
82: [80, 81, 85, 42, 45],
83: [80, 99, 85, 102, 136],
84: [80, 81, 45, 86],
85: [98, 101, 102, 42, 43, 44, 47, 80, 82, 51, 83],
86: [80, 84, 103, 108, 45],
87: [96, 97, 88, 89, 92, 94],
88: [96, 87, 151, 152, 94],
89: [96, 97, 87, 11, 91, 92],
90: [16, 19, 93, 9, 91, 92, 13],
91: [16, 89, 90, 11, 92],
92: [194, 142, 176, 87, 184, 89, 90, 91, 93, 94, 95],
93: [19, 22, 119, 184, 90, 92],
94: [87, 151, 153, 88, 92, 95],
95: [192, 194, 153, 92, 94],
96: [129, 97, 135, 11, 87, 88, 89, 152],
97: [96, 89, 87],
98: [101, 100, 85, 102],
99: [136, 83, 131, 102],
100: [98, 36, 101, 47],
101: [98, 100, 85, 47],
102: [83, 98, 99, 85],
103: [80, 86, 136, 108, 104, 111],
104: [103, 136, 106, 108, 111],
105: [153, 106, 107, 108, 109],
106: [108, 133, 136, 105, 104, 109, 110],
107: [153, 151, 105, 109, 110],
108: [193, 103, 104, 105, 106, 75, 45, 86, 153],
109: [105, 106, 107, 110],
110: [130, 132, 133, 151, 106, 107, 109],
111: [136, 104, 103],
112: [113, 123, 116, 117],
113: [112, 117, 118, 57, 122],
114: [0, 180, 55, 54, 119, 120, 62],
115: [57, 124, 118, 63],
116: [112, 121, 123, 117],
117: [112, 113, 116, 54, 120, 121, 122],
118: [113, 115, 57],
119: [114, 180, 21, 22, 184, 121, 120, 93, 25],
120: [114, 117, 54, 119, 121],
121: [116, 21, 117, 119, 24, 123, 120],
122: [113, 117, 53, 54, 57],
123: [112, 116, 21, 24, 121],
124: [65, 115, 56, 125, 126, 63],
125: [60, 56, 58, 124, 126],
126: [124, 161, 156, 157, 154, 60, 125],
127: [66, 130, 132, 133, 131],
128: [17, 11, 20, 134],
129: [96, 20, 134, 135, 11],
130: [132, 133, 110, 127],
131: [136, 99, 133, 127],
132: [130, 134, 151, 110, 127],
133: [130, 131, 136, 106, 110, 127],
134: [128, 129, 132, 20, 135, 151],
135: [96, 129, 134, 151, 152],
136: [99, 131, 133, 103, 104, 106, 111, 80, 83],
137: [144, 139, 140, 141],
138: [143, 179, 183, 139, 141, 175],
139: [137, 138, 141, 143],
140: [144, 137, 141],
141: [192, 183, 137, 138, 139, 140, 142],
142: [192, 176, 194, 181, 183, 92, 141],
143: [2, 3, 138, 139, 175],
144: [137, 140],
145: [193, 146, 147, 164, 173],
146: [193, 145, 164, 70, 75, 77, 78],
147: [145, 195, 193],
148: [162, 149, 166, 150, 40, 186, 187],
149: [34, 162, 148, 40, 172],
150: [40, 187, 148],
151: [132, 134, 135, 107, 110, 88, 153, 152, 94],
152: [96, 151, 88, 135],
153: [192, 193, 105, 107, 108, 151, 94, 95],
154: [161, 126, 155, 157, 158, 159],
155: [154, 156, 157, 159],
156: [157, 155, 60, 61, 126],
157: [154, 155, 156, 126],
158: [160, 161, 154, 159],
159: [160, 158, 154, 155, 189, 190],
160: [158, 190, 159],
161: [154, 126, 158],
162: [148, 149, 166, 167, 172],
163: [164, 167, 168, 169, 171, 172],
164: [163, 72, 169, 171, 173, 78, 145, 146],
165: [174, 166],
166: [162, 148, 165, 186, 188],
167: [162, 163, 170, 171, 172],
168: [163, 38, 169, 74, 172],
169: [163, 164, 71, 72, 74, 168],
170: [167],
171: [163, 164, 173, 167],
172: [34, 162, 163, 149, 38, 167, 168],
173: [145, 171, 164],
174: [165],
175: [2, 5, 138, 143, 177, 178, 179, 182],
176: [177, 178, 180, 181, 184, 92, 142],
177: [176, 178, 179, 181, 175],
178: [176, 177, 180, 5, 175],
179: [177, 181, 183, 138, 175],
180: [0, 176, 178, 114, 5, 119, 184],
181: [176, 177, 179, 183, 142],
182: [0, 2, 5, 175],
183: [179, 181, 138, 141, 142],
184: [176, 180, 119, 92, 93],
185: [186, 187, 188],
186: [148, 166, 185, 187, 188],
187: [148, 150, 185, 186, 188],
188: [185, 186, 187, 166],
189: [159, 190, 191],
190: [160, 189, 159],
191: [189],
192: [193, 194, 153, 141, 142, 95],
193: [192, 195, 75, 108, 145, 146, 147, 153],
194: [192, 92, 142, 95],
195: [193, 147]}
# the matrix of neighboorhood:
pd.DataFrame(*w_queen.full()).astype(int) # 1 means both are neighbors
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 191 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 192 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 193 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 194 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 195 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
196 rows × 196 columns
# pct of neighboorhood (density)
w_queen.pct_nonzero
2.7332361516034984
# a province with NO neighbor?
w_queen.islands
[]
Moran's correlation¶
Necesitamos la matriz de vecindad (la matriz de peso) para calcular la correlación espacial: si el valor de la variable está correlacionado con los valores de sus vecinos, lo que demuestra un efecto espacial.
# needed for spatial correlation
w_queen.transform = 'R'
pd.DataFrame(*w_queen.full()).sum(axis=1) # 1 means both are neighbors
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
...
191 1.0
192 1.0
193 1.0
194 1.0
195 1.0
Length: 196, dtype: float64
La correlación es medida por la "Moran's I statitic":
#!pip install esda
from esda.moran import Moran
moranCOVID = Moran(covid_provYear_Adultos_Mayores_map['date202101_qt'], w_queen)
moranCOVID.I,moranCOVID.p_sim
(0.0014287175342288713, 0.424)
La Moran's I es significante. Veamos:
#!pip install splot
from splot.esda import moran_scatterplot
import matplotlib.pyplot as plt
fig, ax = moran_scatterplot(moranCOVID)
ax.set_xlabel('Covid_PAM_share')
ax.set_ylabel('SpatialLag_Covid_PAM_share')
Text(0, 0.5, 'SpatialLag_Covid_PAM_share')
Local Spatial Correlation¶
Es posible que se detecte una correlación no significante. Veamos esos valores:
# The scatterplot with local info
from esda.moran import Moran_Local
# calculate Moran_Local and plot
lisaCOVID = Moran_Local(y=covid_provYear_Adultos_Mayores_map['date202101_qt'], w=w_knn,seed=2022)
fig, ax = moran_scatterplot(lisaCOVID,p=0.05)
ax.set_xlabel('COVID_PAM_share')
ax.set_ylabel('SpatialLag_COVID_PAM_share');
from splot.esda import plot_local_autocorrelation
plot_local_autocorrelation(lisaCOVID, covid_provYear_Adultos_Mayores_map,'date202101_qt')
plt.show()
# the map with the spots and outliers
from splot.esda import lisa_cluster
f, ax = plt.subplots(1, figsize=(12, 12))
plt.title('Spots and Outliers')
fig = lisa_cluster(lisaCOVID,
covid_provYear_Adultos_Mayores_map,ax=ax,
legend_kwds={'loc': 'center left',
'bbox_to_anchor': (0.7, 0.6)})
Añadimos esta data al gdf:
# quadrant
lisaCOVID.q
array([4, 2, 1, 1, 1, 2, 1, 4, 3, 3, 3, 3, 2, 4, 3, 1, 3, 3, 2, 4, 4, 2,
2, 3, 2, 2, 1, 1, 4, 1, 2, 4, 4, 2, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3,
3, 1, 4, 3, 2, 4, 4, 3, 3, 1, 4, 3, 4, 4, 1, 1, 4, 2, 4, 1, 2, 2,
2, 4, 2, 2, 4, 1, 4, 2, 4, 1, 3, 3, 4, 2, 1, 2, 1, 1, 1, 1, 1, 1,
1, 3, 3, 4, 2, 3, 1, 1, 3, 3, 1, 2, 4, 3, 1, 1, 4, 2, 3, 3, 1, 3,
3, 2, 4, 3, 2, 4, 2, 2, 2, 1, 4, 1, 4, 2, 2, 2, 1, 4, 4, 3, 4, 1,
3, 4, 4, 4, 2, 1, 1, 1, 2, 2, 1, 1, 1, 4, 4, 2, 4, 2, 3, 1, 3, 1,
1, 1, 1, 2, 1, 2, 1, 1, 4, 4, 1, 3, 3, 3, 1, 2, 2, 4, 4, 3, 3, 1,
1, 1, 3, 2, 2, 1, 1, 1, 3, 3, 3, 4, 3, 1, 1, 2, 1, 2, 1, 3])
# significance
lisaCOVID.p_sim
array([0.389, 0.102, 0.076, 0.103, 0.171, 0.036, 0.086, 0.069, 0.08 ,
0.382, 0.269, 0.052, 0.309, 0.04 , 0.105, 0.484, 0.05 , 0.215,
0.144, 0.035, 0.016, 0.04 , 0.298, 0.018, 0.124, 0.004, 0.427,
0.44 , 0.397, 0.288, 0.367, 0.257, 0.426, 0.337, 0.249, 0.251,
0.209, 0.291, 0.388, 0.248, 0.272, 0.17 , 0.336, 0.244, 0.308,
0.093, 0.448, 0.114, 0.294, 0.431, 0.061, 0.431, 0.305, 0.203,
0.184, 0.374, 0.38 , 0.213, 0.217, 0.447, 0.36 , 0.151, 0.095,
0.389, 0.244, 0.428, 0.378, 0.271, 0.369, 0.448, 0.267, 0.49 ,
0.445, 0.495, 0.321, 0.231, 0.392, 0.37 , 0.334, 0.474, 0.114,
0.032, 0.043, 0.233, 0.098, 0.434, 0.031, 0.348, 0.298, 0.069,
0.143, 0.016, 0.433, 0.199, 0.382, 0.04 , 0.056, 0.318, 0.361,
0.107, 0.33 , 0.282, 0.149, 0.385, 0.424, 0.437, 0.314, 0.46 ,
0.319, 0.446, 0.45 , 0.303, 0.225, 0.411, 0.221, 0.125, 0.031,
0.391, 0.469, 0.015, 0.097, 0.243, 0.188, 0.238, 0.48 , 0.381,
0.119, 0.432, 0.034, 0.42 , 0.418, 0.363, 0.132, 0.281, 0.162,
0.09 , 0.13 , 0.187, 0.01 , 0.02 , 0.189, 0.045, 0.16 , 0.084,
0.18 , 0.392, 0.419, 0.321, 0.021, 0.372, 0.042, 0.437, 0.1 ,
0.258, 0.095, 0.48 , 0.325, 0.196, 0.139, 0.17 , 0.275, 0.19 ,
0.041, 0.336, 0.295, 0.006, 0.103, 0.323, 0.423, 0.022, 0.133,
0.162, 0.107, 0.411, 0.007, 0.102, 0.2 , 0.238, 0.241, 0.024,
0.475, 0.246, 0.041, 0.04 , 0.102, 0.046, 0.141, 0.015, 0.298,
0.243, 0.222, 0.182, 0.117, 0.373, 0.318, 0.294])
# quadrant: 1 HH, 2 LH, 3 LL, 4 HL
pd.Series(lisaCOVID.q).value_counts()
1 59 3 49 4 44 2 44 Name: count, dtype: int64
La información en LisaCOVID.q no puede usarse de manera correcta, necesitamos añadir si la correlación local spatial es significante:
covid_provYear_Adultos_Mayores_map['COVID_quadrant']=[l if p <0.05 else 0 for l,p in zip(lisaCOVID.q,lisaCOVID.p_sim) ]
covid_provYear_Adultos_Mayores_map['COVID_quadrant'].value_counts()
COVID_quadrant 0 167 2 8 4 8 1 8 3 5 Name: count, dtype: int64
Ahora recodificamos
labels = [ '0 no_sig', '1 hotSpot', '2 coldOutlier', '3 coldSpot', '4 hotOutlier']
covid_provYear_Adultos_Mayores_map['COVID_quadrant_names']=[labels[i] for i in covid_provYear_Adultos_Mayores_map['COVID_quadrant']]
covid_provYear_Adultos_Mayores_map['COVID_quadrant_names'].value_counts()
COVID_quadrant_names 0 no_sig 167 2 coldOutlier 8 4 hotOutlier 8 1 hotSpot 8 3 coldSpot 5 Name: count, dtype: int64
Hacemos replot:
from matplotlib import colors
myColMap = colors.ListedColormap([ 'ghostwhite', 'red', 'green', 'black','orange'])
f, ax = plt.subplots(1, figsize=(12,12))
plt.title('Spots and Outliers')
covid_provYear_Adultos_Mayores_map.plot(column='COVID_quadrant_names',
categorical=True,
cmap=myColMap,
linewidth=0.1,
edgecolor='white',
legend=True,
legend_kwds={'loc': 'center left',
'bbox_to_anchor': (0.7, 0.6)},
ax=ax)
# Remove axis
ax.set_axis_off()
# Display the map
plt.show()
Bivariate LISA¶
#!pip install branca
#!pip install folium
covid_provYear_Adultos_Mayores_map.explore("COVID_quadrant_names", categorical=True,tooltip='location',cmap=myColMap)